In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import dateutil.parser
In [5]:
# Note to self for later: http://stackoverflow.com/a/17001474
In [6]:
# Data from https://www.census.gov/econ/currentdata/datasets/
data_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='data')
data_df.head()
Out[6]:
In [7]:
categories_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='categories')
data_types_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='data_types')
error_types_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='error_types')
geo_levels_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='geo_levels')
periods_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='periods')
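In [ ]:
# An alternative sketch: newer pandas spells the keyword sheet_name, and passing
# sheet_name=None reads every sheet at once into a dict of DataFrames keyed by
# sheet name (kept commented here since this notebook uses the older sheetname keyword).
# sheets = pd.read_excel("RESSALES-mf.xlsx", sheet_name=None)
# data_df, categories_df = sheets['data'], sheets['categories']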
In [8]:
categories_df.head(2)
Out[8]:
In [9]:
# it auto-merges cat_idx in our original dataframe with cat_idx in categories_df
# it auto-merges dt_idx in our original dataframe with dt_idx in data_types_df
# it auto-merges geo_idx in our original dataframe with geo_idx in geo_levels_df
# it auto-merges per_idx in our original dataframe with per_idx in periods_df
df = data_df.merge(categories_df).merge(data_types_df).merge(geo_levels_df).merge(periods_df)
# We only want to look at the total number of homes sold across the entire United States
df = df[(df['cat_code'] == 'SOLD') & (df['geo_code'] == 'US') & (df['dt_code'] == 'TOTAL')]
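In [ ]:
# A sketch of the same join with the keys named explicitly (the chained merges
# above find these shared *_idx columns automatically); df_explicit is just an
# illustrative name:
df_explicit = (data_df.merge(categories_df, on='cat_idx')
                      .merge(data_types_df, on='dt_idx')
                      .merge(geo_levels_df, on='geo_idx')
                      .merge(periods_df, on='per_idx'))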
In [10]:
# We don't merge error_types_df because all of the errors are the same
df['et_idx'].value_counts()
Out[10]:
In [11]:
df.head(2)
Out[11]:
In [12]:
# Now let's remove the join columns to keep things clean
df = df.drop(['per_idx', 'cat_idx', 'dt_idx', 'et_idx', 'geo_idx'], axis=1)
df.head()
Out[12]:
In [13]:
# At least we can see 'per_name' (period name) is already a datetime!
df.info()
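In [ ]:
# If 'per_name' had come in as plain strings instead, a sketch of the fix would
# be pd.to_datetime; it is not needed here since info() already shows datetime64:
# df['per_name'] = pd.to_datetime(df['per_name'])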
In [ ]:
#change the index to be the datetime
In [14]:
df.head(3)
Out[14]:
In [16]:
df.index = df['per_name']
df.head(3)
Out[16]:
In [17]:
del df['per_name']
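In [ ]:
# The two steps above (copy the column into the index, then delete the column)
# can be collapsed into one call; a sketch of the equivalent, since set_index
# drops the column by default:
# df = df.set_index('per_name')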
In [18]:
df.head(3)
Out[18]:
In [ ]:
# Select specific dates via the index
In [19]:
mylist = ['cat', 'dog', 'frog', 'log']
In [20]:
mylist[0]
Out[20]:
In [27]:
#pick everything from a given year
df['1987']['val'].sum()
Out[27]:
In [28]:
df['1980']['val'].sum()
Out[28]:
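In [ ]:
# The same per-year selections written with .loc, which avoids the chained
# df['1987']['val'] lookup and is the form newer pandas prefers:
df.loc['1987', 'val'].sum(), df.loc['1980', 'val'].sum()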
In [29]:
mylist
Out[29]:
In [30]:
mylist[:2]
Out[30]:
In [31]:
mylist[2:4]
Out[31]:
In [33]:
df['2001':].head(10)
Out[33]:
In [34]:
df['1965-06':'1972-02']
Out[34]:
In [35]:
df[:'1990']
Out[35]:
In [36]:
df['1980':'1990']['val'].sum()
Out[36]:
In [37]:
df['1990':'2000']['val'].sum()
Out[37]:
In [41]:
df[:'1999']['val'].median()
Out[41]:
In [42]:
df['2000':]['val'].median()
Out[42]:
In [43]:
df.plot(y='val')
Out[43]:
In [45]:
df['1980'].plot(y='val')
Out[45]:
In [46]:
df['1966'].plot(y='val')
Out[46]:
In [49]:
df.resample('A').sum()
Out[49]:
In [50]:
df.resample('A').sum().plot(y='val')
Out[50]:
In [51]:
df.resample('Q').sum().plot(y='val')
Out[51]:
In [52]:
#Collecting into buckets of 5 years
df.resample('5A').sum().plot(y='val')
Out[52]:
In [53]:
#Collecting into buckets of 10 years
df.resample('10A').sum().plot(y='val')
Out[53]:
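In [ ]:
# A sketch of similar 10-year buckets built with groupby instead of resample,
# flooring each year to its calendar decade (the bucket edges differ slightly
# from resample's year-end anchoring):
df.groupby((df.index.year // 10) * 10)['val'].sum()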
In [55]:
df.plot(y='val')
df.resample('A').median().plot(y='val')
df.resample('5A').median().plot(y='val')
Out[55]:
In [65]:
ax = df.plot(y='val', label='Monthly', alpha=0.5)
df.resample('A').median().plot(y='val', ax=ax, label='Annually')
df.resample('5A').median().plot(y='val', ax=ax, label='Every 5 years')
ax.set_title("Homes in Crisis")
ax.set_xlabel("Time")
ax.set_ylabel("New houses sold")
#typing ax.set and hitting TAB lists the other available setters
plt.savefig("TEST.svg")
In [59]:
df.index.name = 'Period Name'
In [66]:
import datetime
datetime.datetime(2016, 6, 7, 14, 30).month
Out[66]:
In [68]:
#df['val'] / 4
In [69]:
df.index.month
Out[69]:
In [71]:
#For a given month, what is the average number of houses sold?
#So we group by the month (df.index.month gives the month number for each entry in the index)
#and then take the median of each group
df.groupby(by=df.index.month).median()
Out[71]:
In [72]:
df.groupby(by=df.index.month).median().plot(y='val')
Out[72]:
In [76]:
#improving the graph
ax = df.groupby(by=df.index.month).median().plot(y='val', legend=False)
ax.set_xticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
ax.set_ylabel('Houses sold (in thousands)')
ax.set_title('Houses sales by month, 1963-2016')
Out[76]:
In [79]:
#average sold per month
df.groupby(by=df.index.month).median()
#The least sold in any month (e.g. the worst January ever, the worst July ever)
df.groupby(by=df.index.month).min()
#The most sold in any month (e.g. the best January ever, the best July ever)
#Only this last expression shows up as the cell's output
df.groupby(by=df.index.month).max()
Out[79]:
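In [ ]:
# The three per-month summaries above can also be computed in one pass with agg
# on the 'val' column; the result gets one column per aggregation name:
df.groupby(by=df.index.month)['val'].agg(['min', 'median', 'max'])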
In [83]:
#average sold per month
ax = df.groupby(by=df.index.month).median().plot(y='val', label='Average')
#The least sold in any month (e.g. the worst January ever, the worst July ever)
df.groupby(by=df.index.month).min().plot(y='val', ax=ax, label='Least sold')
#The most sold in any month (e.g. the best January ever, the best July ever)
df.groupby(by=df.index.month).max().plot(y='val', ax=ax, label='Most sold')
Out[83]:
In [91]:
#Make a graph of the average
ax = df.groupby(by=df.index.month).median().plot(y='val', label='Average')
x_values = df.groupby(by=df.index.month).median().index
#get the minimum
min_values = df.groupby(by=df.index.month)['val'].min()
#get the maximum
max_values = df.groupby(by=df.index.month)['val'].max()
ax.fill_between([1,2,3,4,5,6,7,8,9,10,11,12],min_values, max_values, alpha=0.3)
ax.set_title('House sales go up and down')
Out[91]:
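In [ ]:
# The x_values index computed above can replace the literal month list passed to
# fill_between; a sketch of the same shaded plot:
ax = df.groupby(by=df.index.month).median().plot(y='val', label='Average')
ax.fill_between(x_values, min_values, max_values, alpha=0.3)
ax.set_title('House sales go up and down')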
In [89]:
df.groupby(by=df.index.month).median().index
Out[89]:
In [94]:
ax = df.resample('A').median().plot(y='val')
x_values = df.resample('A').median().index
min_values = df.resample('A')['val'].min()
max_values = df.resample('A')['val'].max()
ax.fill_between(x_values, max_values, min_values, alpha=0.3)
Out[94]:
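In [ ]:
# fill_between also accepts a label, so the shaded band can appear in the legend;
# a sketch building on the annual x_values, min_values and max_values from the
# previous cell:
ax = df.resample('A').median().plot(y='val', label='Annual median')
ax.fill_between(x_values, min_values, max_values, alpha=0.3, label='Min to max')
ax.legend()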